1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.retriever;
28
29 import java.io.ByteArrayOutputStream;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.net.URL;
33 import org.apache.commons.httpclient.Header;
34 import org.apache.commons.httpclient.HostConfiguration;
35 import org.apache.commons.httpclient.HttpClient;
36 import org.apache.commons.httpclient.HttpConnectionManager;
37 import org.apache.commons.httpclient.HttpMethod;
38 import org.apache.commons.httpclient.HttpStatus;
39 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
40 import org.apache.commons.httpclient.cookie.CookiePolicy;
41 import org.apache.commons.httpclient.methods.PostMethod;
42 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
43 import org.apache.log4j.Logger;
44 import org.smartcrawler.common.AbstractParametrizableComponent;
45 import org.smartcrawler.common.Context;
46 import org.smartcrawler.common.Link;
47 import org.smartcrawler.common.MalformedLinkException;
48 import org.smartcrawler.common.SCLogger;
49 import org.smartcrawler.extractor.HtmlURL;
50 import org.smartcrawler.extractor.HtmlURLImpl;
51 import org.smartcrawler.extractor.LinkBuilderImpl;
52
53 /***
54 *
55 *
56 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
57 * @version <tt>$Revision: 1.6 $</tt>
58 */
59 public class HttpCallRetriever extends AbstractParametrizableComponent implements Retriever {
60
61 protected static Logger log = SCLogger.getLogger(HttpCallRetriever.class);
62 protected static Logger logCons = SCLogger.getConsoleLogger();
63 protected static Logger logAcc = SCLogger.getAccessLogger();
64
65 protected String host;
66
67 /***
68 * Creates a new instance of HttpRetriever
69 * @param host
70 */
71 public HttpCallRetriever() {
72 log.info("Created retriever");
73 }
74
75 protected HttpClient getHttpClient() {
76 return new HttpClient();
77 }
78
79 /***
80 *
81 * @param call
82 * @return
83 */
84 public Response execute(Call call) {
85 log.debug("getContent(): BEGIN");
86 Link link = call.getLink();
87 Response res = new Response();
88 HttpMethod m = null;
89 HostConfiguration hostConf = null;
90 try {
91 log.debug("getContent(): call.execute for " + link);
92 m = createHttpMethod(call);
93
94
95
96
97
98
99
100 HttpClient client = getHttpClient();
101 int statusCode = client.executeMethod(
102 createHostConfiguration(call), m);
103
104 res.setFound(
105 statusCode == HttpStatus.SC_OK);
106
107 res.setRedirected(
108
109 statusCode == HttpStatus.SC_NOT_MODIFIED ||
110 statusCode == HttpStatus.SC_USE_PROXY ||
111 statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
112 statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
113 statusCode == HttpStatus.SC_SEE_OTHER ||
114 statusCode == HttpStatus.SC_TEMPORARY_REDIRECT
115 );
116 log.debug("getContent(): call.execute status = "
117 + statusCode);
118
119 String logStr = link + " " + m.getStatusLine();
120 log.info(logStr);
121 logCons.info(logStr);
122 logAcc.info(logStr);
123
124 if (!res.isFound() && !res.isRedirected()) {
125 log.debug("getContent(): Method failed: "
126 + m.getStatusLine() + " invalid url: " + link);
127 logAcc.info(link + " " + m.getStatusLine());
128 logCons.info(link + " " + m.getStatusLine());
129 } else if (res.isRedirected()) {
130 res.setRedirection(getRedirLink(m, link));
131 } else {
132 res.setContent(getContent(m, link));
133 }
134
135 } catch (Exception e) {
136 log.error("getContent(): Error retrieving link: " + link
137 + " Caused by: " + e.getMessage());
138
139 logCons.info("Error retrieving link: " + link
140 + " Caused by: " + e.getMessage());
141 res.setFound(false);
142 logAcc.error(link + " ; KO; " + e.getMessage());
143 } finally {
144 try {
145 m.releaseConnection();
146 } catch (Exception e) {}
147 }
148
149 log.debug("getContent(): END");
150 return res;
151 }
152
153 /***
154 *
155 * @param call
156 * @return
157 */
158 protected HttpMethod createHttpMethod(Call call) {
159 HttpMethod m;
160 if (call.getMethod() == Call.GET)
161 m = new SmartGetMethod(call.getLink().toString());
162 else
163 m = new PostMethod(call.getLink().toString());
164 m.setFollowRedirects(false);
165 m.setRequestHeader("User-Agent", call.getUserAgent());
166
167
168 return m;
169 }
170
171 /***
172 * Factory method which creates the default host configuration
173 * @param host
174 * @return
175 */
176 protected HostConfiguration createHostConfiguration(Call call) {
177 log.debug("createHostConfiguration: BEGIN");
178 HostConfiguration hc = new HostConfiguration();
179 URL url = call.getLink().getURL();
180 hc.setHost(url.getHost(), url.getPort(), url.getProtocol());
181 log.debug("createHostConfiguration: END");
182 return hc;
183 }
184
185 /***
186 *
187 * @param m
188 * @return
189 */
190 protected String getContentType(HttpMethod m) {
191 return m.getResponseHeader("Content-Type").getValue().trim();
192 }
193
194 /***
195 *
196 * @throws org.smartcrawler.common.MalformedLinkException
197 * @return
198 */
199 public Link getRedirLink(HttpMethod m, Link referrer) throws MalformedLinkException {
200 Link redir = null;
201 Header locationHeader = m.getResponseHeader("location");
202 if (locationHeader!=null) {
203 String redirLocation = locationHeader.getValue().trim();
204 HtmlURL htmlURL = new HtmlURLImpl(redirLocation);
205 redir = (new LinkBuilderImpl(referrer)).buildLink(htmlURL);
206
207
208 }
209 return redir;
210 }
211
212 /***
213 *
214 * @param m
215 * @param link
216 * @throws java.io.IOException
217 * @return
218 */
219 public Content getContent(HttpMethod m, Link link) throws IOException {
220 InputStream in = m.getResponseBodyAsStream();
221 ByteArrayOutputStream out = new ByteArrayOutputStream();
222 byte[] buf = new byte[1024];
223 int len;
224 while ((len = in.read(buf)) > 0) {
225 out.write(buf, 0, len);
226 }
227 in.close();
228 out.close();
229 Content c = new Content();
230 c.setBuffer(out.toByteArray());
231 c.setContentType(getContentType(m));
232 c.setLink(link);
233 return c;
234 }
235
236
237 }